/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#ifndef _UDM_COMMON_H
#define _UDM_COMMON_H

#include <stdio.h> /* for FILE etc. */

#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif

#include "udm_unicode.h"
#include "udm_uniconv.h"

/*
 * Integer aliases used by the structures below.
 * They are macros (not typedefs), so they expand textually at each use.
 *   uint4 - unsigned int
 *   uint8 - unsigned __int64 when WIN32 or WINNT is defined,
 *           unsigned long long otherwise
 *
 * The platform test uses defined(): the former "#if (WIN32|WINNT)" was a
 * preprocessor syntax error whenever one of the macros was defined with an
 * empty body (a bare "#define WIN32").
 */
#define uint4 unsigned int
#if defined(WIN32) || defined(WINNT)
#define uint8 unsigned __int64
#else
#define uint8 unsigned long long
#endif

/* Some constants */
#define UDM_LANGPERDOC				16		/* FIXME */
#define UDM_USER_AGENT				"MnogoSearch"
#define UDM_MAXWORDPERQUERY			32

/*
 * Some sizes and other definitions.
 * Every macro whose value is an expression is wrapped in parentheses so
 * that it expands correctly inside a larger expression, for example
 * "x / UDM_MAXDOCSIZE" or "t % UDM_DEFAULT_REINDEX_TIME".
 */
#define UDM_MAXDOCSIZE				(1024*1024)	/* 1 MB  */
#define UDM_DEFAULT_REINDEX_TIME		(7*24*60*60)	/* 1 week, in seconds */
#define UDM_URLSIZE				127
#define UDM_MAXWORDSIZE				32
#define UDM_MAXDISCWORDSIZE			64
#define UDM_DEFAULT_MAX_HOPS			256
#define UDM_READ_TIMEOUT			30
#define UDM_DOC_TIMEOUT				90
#define UDM_MAXNETERRORS			16
#define UDM_DEFAULT_NET_ERROR_DELAY_TIME	86400		/* 24*60*60 */
#define UDM_DEFAULT_BAD_SINCE_TIME              (15*24*60*60)   /* 15 days, in seconds */


/*
 * storage types
 * Integer codes (0..6) naming the available storage modes.  This header
 * only defines the values; the code that interprets them is elsewhere.
 */
#define UDM_DBMODE_SINGLE	0
#define UDM_DBMODE_MULTI	1
#define UDM_DBMODE_SINGLE_CRC	2
#define UDM_DBMODE_WORD2URL	3
#define UDM_DBMODE_MULTI_CRC	4
#define UDM_DBMODE_CACHE	5
#define UDM_DBMODE_BLOB		6


/*
 * database open modes
 * Two mutually exclusive codes: open for reading or open for writing.
 */
#define UDM_OPEN_MODE_READ	0
#define UDM_OPEN_MODE_WRITE	1


/*
 * search modes
 * Two independent groups of codes, each starting at 0:
 *   UDM_ORD_*  - ordering of results (rate or date, going by the names)
 *   UDM_MODE_* - query interpretation (all / any / bool / phrase)
 */
#define UDM_ORD_RATE		0
#define UDM_ORD_DATE		1
#define UDM_MODE_ALL		0
#define UDM_MODE_ANY		1
#define UDM_MODE_BOOL		2
#define UDM_MODE_PHRASE		3

/*
 * word match type
 * UDM_MATCH_FULL .. UDM_MATCH_WILD are consecutive codes 0..5.
 * NOTE(review): UDM_MATCH_ICASE has the same value as UDM_MATCH_BEGIN, so it
 * cannot be told apart from it in the same field; presumably it is meant for
 * a separate case-sensitivity field (cf. UDM_MATCH.case_sense) -- confirm.
 */
#define UDM_MATCH_FULL		0
#define UDM_MATCH_BEGIN		1
#define UDM_MATCH_SUBSTR	2
#define UDM_MATCH_END		3
#define UDM_MATCH_REGEX		4
#define UDM_MATCH_WILD		5
#define UDM_MATCH_ICASE		1

/*
 * Indexer return codes
 * UDM_OK is zero; every other code is a distinct non-zero value.
 */
#define UDM_OK			0
#define UDM_ERROR		1
#define UDM_NOTARGET		2
#define UDM_TERMINATED		3


/*
 * Flags for indexing.
 * Each flag is a distinct single bit, so several of them can be OR-ed
 * together into one integer.
 */
#define UDM_FLAG_REINDEX	(1 << 0)
#define UDM_FLAG_SORT_EXPIRED	(1 << 1)
#define UDM_FLAG_SORT_HOPS	(1 << 2)
#define UDM_FLAG_ADD_SERV	(1 << 3)
#define UDM_FLAG_SPELL		(1 << 4)
#define UDM_FLAG_LOAD_LANGMAP	(1 << 5)


/*
 * URLFile actions
 * Consecutive action codes 1..4 (not bit flags).
 */
#define UDM_URL_FILE_REINDEX	1
#define UDM_URL_FILE_CLEAR	2
#define UDM_URL_FILE_INSERT	3
#define UDM_URL_FILE_PARSE	4

/*
 * Ispell mode binary flags
 * Distinct single bits (1, 2, 4), so they can be combined with OR.
 */
#define UDM_ISPELL_MODE_DB	1
#define UDM_ISPELL_USE_PREFIXES	2
#define UDM_ISPELL_MODE_SERVER  4

/*
 * Action type: HTTP methods
 * Consecutive codes 1..7.  Besides GET and HEAD the list also contains
 * codes that are not HTTP verbs (DISALLOW, HREFONLY, CHECKMP3, ...).
 */
#define UDM_METHOD_GET		1
#define UDM_METHOD_DISALLOW	2
#define UDM_METHOD_HEAD		3
#define UDM_METHOD_HREFONLY	4
#define UDM_METHOD_CHECKMP3	5
#define UDM_METHOD_CHECKMP3ONLY	6
#define UDM_METHOD_VISITLATER	7

/*
 * Tags/categories defines
 * NOTE(review): the unit of these sizes (bytes, characters, ...) is not
 * visible in this header -- confirm at the point of use.
 */
#define UDM_CATSIZE	8
#define UDM_TAGSIZE	4

/*
 * Word origins.
 * One bit per origin, so a word can carry several origins at once.
 */
#define UDM_WORD_ORIGIN_QUERY   (1 << 0)
#define UDM_WORD_ORIGIN_SPELL   (1 << 1)
#define UDM_WORD_ORIGIN_SYNONYM (1 << 2)
#define UDM_WORD_ORIGIN_STOP    (1 << 3)

/************************ Statistics **********************/
/*
 * One statistics record: a status value plus two counters.
 * NOTE(review): presumably "total" and "expired" count the documents that
 * have this status -- confirm against the code that fills the record.
 */
typedef struct stat_struct {
	int	status;
	int	expired;
	int	total;
} UDM_STAT;

/* Array of UDM_STAT records; nstats presumably holds the element count. */
typedef struct stat_list_struct{
	size_t		nstats;
	UDM_STAT	*Stat;
} UDM_STATLIST;
/************************ VARLISTs ************************/

/* Highest valid index of UDM_VARLIST.bottom[] (the array has 256 slots). */
#define UDM_MAXVARLEVEL		255

/* A single named variable: name, value, section number and lengths. */
typedef struct {
	int		section;	/* Number 0..255 */
	size_t		maxlen;		/* Max length    */
	size_t		curlen;		/* Cur length    */
	char		*val;		/* Field Value   */
	char		*name;		/* Field Name    */
} UDM_VAR;

/*
 * List of UDM_VAR entries.
 * NOTE(review): "level" together with "bottom[]" looks like a stack of
 * nesting levels, each remembering a position in Var[] -- confirm in the
 * varlist implementation.
 */
typedef struct {
	size_t		nvars;
	size_t		level;
	size_t		bottom[UDM_MAXVARLEVEL+1];
	UDM_VAR		*Var;
} UDM_VARLIST;


/************** Language and charset guesser *************/


/* UDM_LM_MAXGRAM sizes UDM_LANGITEM.str (MAXGRAM+3 = 8 bytes).           */
/* UDM_LM_HASHMASK is 0x3FFF; UDM_LANGMAP.memb has HASHMASK+1 = 16384     */
/* slots, so a value AND-ed with the mask is always a valid memb[] index. */
#define UDM_LM_MAXGRAM		5
#define UDM_LM_HASHMASK		0x03FFF

/*
 * One slot of a language map.
 * NOTE(review): presumably "str" is an n-gram, "count" its number of
 * occurrences and "p" a derived weight -- confirm in the guesser code.
 */
typedef struct {
	int		count;
        float		p;
	char		str[UDM_LM_MAXGRAM+3];
} UDM_LANGITEM;

/*
 * Language map for one language/charset pair.  The item table is embedded
 * in the structure, so every UDM_LANGMAP contains 16384 UDM_LANGITEM slots.
 */
typedef struct {
	float		expectation;			/* Average value   */
	char		*lang;				/* Map Language    */
	char		*charset;			/* Map charset     */
	UDM_LANGITEM	memb[UDM_LM_HASHMASK+1];	/* Items list      */
} UDM_LANGMAP;

/* Array of language maps; nmaps presumably holds the element count. */
typedef struct {
	size_t		nmaps;
	UDM_LANGMAP	*Map;
} UDM_LANGMAPLIST;

/*****************************************************/

/*
 * One piece of text together with an href, a section name and a section
 * number.  NOTE(review): ownership of the three strings is not visible here.
 */
typedef struct {
	char		*str;
	char		*href;
	char		*section_name;
	int		section;
} UDM_TEXTITEM;

/* Array of UDM_TEXTITEM; nitems presumably holds the element count. */
typedef struct {
	size_t		nitems;
	UDM_TEXTITEM	*Item;
} UDM_TEXTLIST;

/*****************************************************/

/* StopList unit: a word and the language it belongs to. */
typedef struct udm_stopword_struct {
	char		*word;
	char		*lang;
} UDM_STOPWORD;

/* Array of stop words; nstopwords presumably holds the element count. */
typedef struct {
	size_t		nstopwords;
	UDM_STOPWORD	*StopWord;
} UDM_STOPLIST;

/*****************************************************/

/*
 * Words parameters: minimum and maximum word length plus four factors.
 * NOTE(review): how the *_factor values are applied is not visible in this
 * header -- confirm where they are used.
 */
typedef struct {
	size_t		min_word_len;
	size_t		max_word_len;
	size_t		correct_factor;
	size_t		incorrect_factor;
	size_t		number_factor;
	size_t		alnum_factor;
} UDM_WORDPARAM;


/* Main search structure */
/*
 * One hit: a URL id paired with a 32-bit coordinate.
 * NOTE(review): coord is presumably packed with UDM_WRDCOORD() -- confirm.
 */
typedef struct{
	int		url_id;
	uint4		coord;
} UDM_URL_CRD;

/* Array of hits with an order value and a word attached. */
typedef struct {
	size_t		ncoords;
	size_t		order;
	char		*word;
	UDM_URL_CRD	*Coords;
} UDM_URLCRDLIST;

/*
 * Array of UDM_URLCRDLIST.
 * NOTE(review): "freeme" presumably records whether the structure itself
 * was heap-allocated (cf. UDM_DOCUMENT.freeme) -- confirm.
 */
typedef struct {
	int		freeme;
	size_t		nlists;
	UDM_URLCRDLIST	*List;
} UDM_URLCRDLISTLIST;

/* Word list unit: a word string and its coordinate. */
typedef struct {
	int		coord;
	char		*word;
} UDM_WORD;

/*
 * Growable list of words.  Three counters are kept separately: slots
 * allocated (mwords), slots in use (nwords) and words in the sorted
 * list (swords).
 */
typedef struct {
	size_t		mwords;	/* Number of memory allocated for words     */
	size_t		nwords;	/* Real number of words in list             */
	size_t		swords;	/* Number of words in sorted list           */
	size_t		wordpos;/* For phrases, number of current word      */
	UDM_WORD	*Word;	/* Word list  itself                        */
} UDM_WORDLIST;


/*
 * Helpers for packed 32-bit word coordinates.
 *
 *   UDM_WRDCOORD(p,w)  builds a coordinate: p goes to bits 16 and up,
 *                      w goes to bits 8 and up, bits 0..7 are left zero.
 *                      The arguments are not masked, so w must fit in
 *                      8 bits for the two fields not to overlap.
 *   UDM_WRDPOS(c)      bits 16..31 of c  (the p given to UDM_WRDCOORD)
 *   UDM_WRDSEC(c)      bits  8..15 of c  (the w given to UDM_WRDCOORD)
 *   UDM_WRDNUM(c)      bits  0..7  of c
 *   UDM_WRDMASK(c)     unsigned long with only bit number (c & 0xFF) set
 *
 * UDM_WRDMASK shifts an unsigned 1UL: shifting a signed 1L into the sign
 * bit is undefined behaviour, and that bit is reached for word number 31
 * where long is 32 bits wide.  The shift count must still be smaller than
 * the width of unsigned long.
 */
#define UDM_WRDCOORD(p,w)	( (((unsigned int)(p))<<16)+(((unsigned int)(w))<<8) )
#define UDM_WRDSEC(c)		( (((unsigned int)(c))>>8)&0xFF )
#define UDM_WRDPOS(c)		( ((unsigned int)(c))>>16 )
#define UDM_WRDNUM(c)		( ((unsigned int)(c))&0xFF )
#define UDM_WRDMASK(c)		( 1UL << (((unsigned int)(c))&0xFF) )


/***************************************************************/

/*
 * Cross-word list unit: a word with its position and weight, the URL it
 * is associated with, and a referring id ("referree_id" is the spelling
 * used by the field name).
 */
typedef struct {
	short	pos;
	short	weight;
	char	*word;
	char	*url;
	int	referree_id;
} UDM_CROSSWORD;

/*
 * List of cross-words.
 * NOTE(review): by analogy with UDM_WORDLIST, ncrosswords is presumably the
 * number of used entries and mcrosswords the number allocated -- confirm.
 */
typedef struct {
	size_t		ncrosswords;
	size_t		mcrosswords;
	size_t		wordpos;
	UDM_CROSSWORD	*CrossWord;
} UDM_CROSSLIST;

/*****************************************************************/

/*
 * Spider (crawler) parameters.  The structure is embedded by value in
 * both UDM_SERVER and UDM_DOCUMENT.
 * NOTE(review): the first four fields presumably default to the
 * UDM_MAXNETERRORS, UDM_DEFAULT_NET_ERROR_DELAY_TIME, UDM_READ_TIMEOUT and
 * UDM_DOC_TIMEOUT constants -- confirm in the initialisation code.
 */
typedef struct {
	int	max_net_errors;
	int	net_error_delay_time;
	int	read_timeout;
	int	doc_timeout;
	int	period;		/* Reindex period           */
	int	maxhops;	/* Max way in mouse clicks  */
	int	index;		/* Whether to index words   */
	int	follow;		/* Whether to follow links  */
	int	use_robots;	/* Whether to use robots.txt*/
	int	use_clones;	/* Whether to detect clones */
} UDM_SPIDERPARAM;

/*****************************************************************/

/* Structure to store server parameters */
typedef struct {
	int		rec_id;		/* to store order of appearance */
	int		match_type;	/* regex/substring, etc.    */
	void		*regexp;	/* for realm                */
	char		*url;		/* main argument            */
	char		*alias;		/* For primary aliases      */
	UDM_VARLIST	Vars;		/* Default lang, charset,etc*/
	UDM_VARLIST	ExtraHeaders;	/* Auth and user headers    */
	UDM_SPIDERPARAM	Spider;		/* Spider parameters        */
} UDM_SERVER;

/*
 * List of servers.
 * NOTE(review): by analogy with UDM_WORDLIST, nservers is presumably the
 * number of used entries and mservers the number allocated -- confirm.
 */
typedef struct {
	size_t		nservers;
	size_t		mservers;
	int		have_subnets;
	UDM_SERVER	*Server;
} UDM_SERVERLIST;


/*******************************************************/
/* All links are stored in the cache of this structure */
/* before actual INSERT into database                  */

/*
 * One cached link: target URL, tag and category strings, the referrer id,
 * the hop count, a "stored" marker and the method to use.
 * NOTE(review): "method" presumably holds a UDM_METHOD_* code -- confirm.
 */
typedef struct {
	char	*url;
	char	*tag;
	char	*category;
	int	referrer;
	int	hops;
	int	stored;	
	int	method;
} UDM_HREF;

/*
 * Link cache.
 * NOTE(review): by analogy with UDM_WORDLIST, mhrefs/nhrefs/shrefs are
 * presumably allocated/used/sorted counts; the meaning of dhrefs is not
 * visible in this header -- confirm.
 */
typedef struct {
	size_t		mhrefs;
	size_t		nhrefs;
	size_t		shrefs;
	size_t		dhrefs;
	UDM_HREF	*Href;
} UDM_HREFLIST;

/*******************************************************/

/*
 * Used in FTP sessions.
 * Holds the state of one connection: status and error codes, descriptor,
 * port, timeout, host name, socket address and a buffer with its lengths.
 * "connp" points to another UDM_CONN; NOTE(review): presumably the
 * companion data connection of an FTP session -- confirm.
 * "struct sockaddr_in" comes from <netinet/in.h>, which this header
 * includes only when HAVE_NETINET_IN_H is defined.
 */
typedef struct udm_conn_struct {
        int	status;
        int	connected;
        int	err;
        int	retry;
        int	conn_fd;
        int	port;
        int	timeout;
        char	*hostname;
        struct	sockaddr_in sin;
        int	buf_len;
        int	buf_len_total;
        int	len;
        char	*buf;
        struct	udm_conn_struct *connp;
} UDM_CONN;

/*
 * Parsed URL string.
 * Every component is a fixed buffer of UDM_URLSIZE (127) bytes embedded in
 * the structure, so a component, including any terminating NUL, cannot be
 * longer than that.
 */
typedef struct udm_url {
	char	schema[UDM_URLSIZE];
	char	specific[UDM_URLSIZE];
	char	hostinfo[UDM_URLSIZE];
	char	auth[UDM_URLSIZE];
	char	hostname[UDM_URLSIZE];
	char	path[UDM_URLSIZE];
	char	filename[UDM_URLSIZE];
	char	anchor[UDM_URLSIZE];
	int	port;
	int	default_port;
} UDM_URL;


/***************************************************/

/* Download buffer of a document. */
typedef struct {
	char	*buf;		/* Buffer to download document to          */
	char	*content;	/* Pointer to content, after headers       */
	size_t	size;		/* Number of bytes loaded                  */
	size_t	maxsize;	/* Maximum bytes to load into buf          */
} UDM_HTTPBUF;

/*
 * Everything known about one document: how to fetch it, the download
 * buffer, and the links, words, headers and text sections collected from
 * it.  All list members are embedded by value.
 */
typedef struct {
	int	freeme;		/* Whether  memory was allocated for doc   */
	int	stored;		/* If it is already stored, for AddHref()  */
	
	int	method;		/* How to download document: GET, HEAD etc */
	int	is_mp3;		/* 1 if MP3 tags were found                */
	
	UDM_HTTPBUF		Buf;		/* Buffer       */
	
	UDM_HREFLIST		Hrefs;		/* Link list    */
	UDM_WORDLIST		Words;		/* Word list    */
	UDM_CROSSLIST		CrossWords;	/* Crosswords   */
	
	UDM_VARLIST		RequestHeaders;	/* Extra headers*/
	UDM_VARLIST		Sections;	/* User sections*/
	
	UDM_TEXTLIST		TextList;	/* Text list    */
	UDM_URL			CurURL;		/* Parsed URL   */
	UDM_CHARSET		*lcs;		/* LocalCharset */
	UDM_SPIDERPARAM		Spider;		/* Spider prms  */
	UDM_CONN		connp;		/* For FTP      */
	
} UDM_DOCUMENT;

/********************************************************/

/*
 * A match rule: the match type, a case-sensitivity setting, the pattern
 * text and a second string "reg".
 * NOTE(review): match_type presumably holds a UDM_MATCH_* code and "reg" a
 * compiled or regex form of the pattern -- confirm in the matcher code.
 */
typedef struct {
	int		match_type;
	int		case_sense;
	char		*pattern;
	char		*reg;
} UDM_MATCH;

/* Begin/end pair delimiting one matched part. */
typedef struct {
	int beg;
	int end;
} UDM_MATCH_PART;

/* Aliases item structure: a match rule and its replacement string. */
typedef struct {
	UDM_MATCH	match;
	char		*replace;
} UDM_ALIAS;

/*
 * List of aliases.
 * NOTE(review): by analogy with UDM_WORDLIST, naliases is presumably the
 * number of used entries and maliases the number allocated -- confirm.
 */
typedef struct {
	size_t		naliases;
	size_t		maliases;
	UDM_ALIAS	*Alias;
} UDM_ALIASLIST;

/*
 * External Parsers.
 * One entry names a source MIME type, a target MIME type and a command
 * string.
 */
typedef struct udm_parser_struct{
        char		*from_mime;
	char		*to_mime;
	char		*cmd;
} UDM_PARSER;

/* Array of external parsers; nparsers presumably holds the element count. */
typedef struct {
	size_t		nparsers;
	UDM_PARSER	*Parser;
} UDM_PARSERLIST;

/*
 * Resolve stuff.
 * One resolver entry: host name, its address, a network-error counter and
 * the time of last use.
 * "struct in_addr" needs <netinet/in.h> (included here only when
 * HAVE_NETINET_IN_H is defined).  "time_t" needs <time.h>, which this
 * header does not include directly, so it must already be visible through
 * another header.
 */
typedef struct udm_host_addr_struct {
	char		*hostname;
	struct in_addr	addr;
	int		net_errors;
	time_t		last_used;
}UDM_HOST_ADDR;

/*
 * List of resolver entries.
 * NOTE(review): by analogy with UDM_WORDLIST, nhost_addr is presumably the
 * number of used entries and mhost_addr the number allocated -- confirm.
 */
typedef struct {
	size_t		nhost_addr;
	size_t		mhost_addr;
	UDM_HOST_ADDR	*host_addr;
} UDM_HOSTLIST;

/* Unicode regex lite BEGIN */

/*
 * Token type codes 1..5.
 * NOTE(review): going by the names these stand for substring, begin, end,
 * include and exclude -- confirm in the unireg implementation.
 */
#define UDM_UNIREG_SUB	1
#define UDM_UNIREG_BEG	2
#define UDM_UNIREG_END	3
#define UDM_UNIREG_INC	4
#define UDM_UNIREG_EXC	5

/* One token: a type code and a string stored as an array of int. */
typedef struct{
	int		type;
	int		*str;
} UDM_UNIREG_TOK;

/* An expression: array of tokens; ntokens presumably holds the count. */
typedef struct {
	size_t		ntokens;
	UDM_UNIREG_TOK	*Token;
} UDM_UNIREG_EXP;

/* Unicode regex lite END */


/* Ispell BEGIN */


/*
 * One dictionary word: the word as an int array, a flag string of at most
 * 10 characters plus NUL and a language name of at most 32 characters
 * plus NUL.
 */
typedef struct spell_struct {
	int		*word;
	char		flag[11];
	char		lang[33];
} UDM_SPELL;


/*
 * One affix rule.  mask, find and repl are fixed-size int arrays of 41, 16
 * and 16 elements; findlen and replen presumably hold the used lengths of
 * find and repl.
 * NOTE(review): "compile" presumably marks whether "reg" still has to be
 * built from "mask" -- confirm in the ispell code.
 */
typedef struct aff_struct {
	char		flag;
	char		type;
	char		lang[33];
	int		mask[41];
        int		find[16];
	int		repl[16];
	UDM_UNIREG_EXP	reg;
	size_t		replen;
        size_t		findlen;
        char		compile;
} UDM_AFFIX;

/*
 * Per-language lookup table with 256 Left and 256 Right entries.
 * NOTE(review): lang is only 3 bytes here, while UDM_SPELL.lang and
 * UDM_AFFIX.lang are 33 bytes -- verify that longer names are never copied
 * into this field.
 */
typedef struct Tree_struct {
	int		Left[256];
	int		Right[256];
        char		lang[3];
} Tree_struct;

/*
 * All affix rules plus one prefix tree and one suffix tree for each of
 * UDM_LANGPERDOC (16) languages.
 */
typedef struct {
	size_t		naffixes;
	size_t		maffixes;
	UDM_AFFIX	*Affix;
	Tree_struct	PrefixTree[UDM_LANGPERDOC];
	Tree_struct	SuffixTree[UDM_LANGPERDOC];
} UDM_AFFIXLIST;

/*
 * All dictionary words plus one spell tree for each of UDM_LANGPERDOC (16)
 * languages; nLang presumably counts the languages in use.
 */
typedef struct {
	size_t		nspell;
	size_t		mspell;
        int             nLang;
	UDM_SPELL	*Spell;
	Tree_struct	SpellTree[UDM_LANGPERDOC];
} UDM_SPELLLIST;

/* Ispell END */


/* One robots rule: a command and the path it applies to. */
typedef struct{
	int		cmd; /* 'allow' or 'disallow' */
	char		*path;
} UDM_ROBOT_RULE;

/* Rules collected for one host (identified by its hostinfo string). */
typedef struct{
	char		*hostinfo;
	size_t		nrules;
	UDM_ROBOT_RULE	*Rule;
} UDM_ROBOT;

/* Array of per-host rule sets; nrobots presumably holds the count. */
typedef struct{
	size_t		nrobots;
	UDM_ROBOT	*Robot;
} UDM_ROBOTS;

/* Size of the UDM_AGENT.limits[] array. */
#define MAX_SEARCH_LIMIT 32

/*
 * One search limit: a type code, a file name of at most 1023 characters
 * plus NUL, and a hi/lo pair of 32-bit values.
 * NOTE(review): hi/lo presumably form one 64-bit value or a range --
 * confirm in the cache-mode code.
 */
typedef struct{
	int		type;
	char		file_name[1024];
	uint4		hi;
	uint4		lo;
} UDM_SEARCH_LIMIT;



/* Search daemon types */
/* Counted, untyped list: the element type behind "db" is not fixed here. */
typedef struct {
	size_t		nitems;
	void		*db;
} UDM_DBLIST;

/*
 * One query word kept in two forms: "word" as a char string and "uword" as
 * an int array, together with its length, order, count and a crc value.
 * NOTE(review): "origin" presumably holds UDM_WORD_ORIGIN_* bits, and the
 * unit of "len" is not visible here -- confirm.
 */
typedef struct {
	size_t		order;
	size_t		count;
	int		crcword;
	char		*word;
	int		*uword;
	size_t		len;
        int     	origin;
} UDM_WIDEWORD;

/* Array of wide words; nwords and nuniq are separate counters. */
typedef struct {
	size_t		nuniq;
	size_t		nwords;
	UDM_WIDEWORD	*Word;
} UDM_WIDEWORDLIST;


/* A synonym pair: two wide words, p and s, embedded by value. */
typedef struct {
	UDM_WIDEWORD	p;
	UDM_WIDEWORD	s;
} UDM_SYNONYM;

/*
 * List of synonym pairs.
 * NOTE(review): by analogy with UDM_WORDLIST, nsynonyms is presumably the
 * number of used entries and msynonyms the number allocated -- confirm.
 */
typedef struct {
	size_t		nsynonyms;
	size_t		msynonyms;
	UDM_SYNONYM	*Synonym;
} UDM_SYNONYMLIST;


/* List of MIME entries; the element type is opaque (void *) here. */
typedef struct {
	size_t		nmimes;
	size_t		mmimes;
	void		*Mime;
} UDM_MIMELIST;

/* List of filter entries; the element type is opaque (void *) here. */
typedef struct {
	size_t		nfilters;
	size_t		mfilters;
	void		*Filter;
} UDM_FILTERLIST;



/*
 * One category record: id plus path, link and name, each stored in a
 * fixed 128-byte buffer.
 */
typedef struct udm_category_struct {
	int		rec_id;
	char		path[128];
	char		link[128];
	char		name[128];
} UDM_CATITEM;

/* An address (128-byte buffer) and the array of category records for it. */
typedef struct {
	char		addr[128];
	size_t		ncategories;
	UDM_CATITEM	*Category;
} UDM_CATEGORY;

/* Boolean search constants and types */
/*
 * UDM_MAXSTACK is the capacity of every stack/array below.
 * UDM_STACK_* are the item codes: LEFT, RIGHT, BOT, OR, AND and NOT take
 * the values 0..5, and WORD is set apart at 200.
 */
#define UDM_MAXSTACK	100
#define UDM_STACK_LEFT	0
#define UDM_STACK_RIGHT	1
#define UDM_STACK_BOT	2
#define UDM_STACK_OR	3
#define UDM_STACK_AND	4
#define UDM_STACK_NOT	5
#define UDM_STACK_WORD	200

/*
 * Two fixed-size stacks with their fill counts: cstack of int and astack
 * of unsigned long.
 * NOTE(review): presumably a command stack and an argument stack used while
 * evaluating a boolean query -- confirm in the evaluator.
 */
typedef struct {
	int		ncstack;
	int		cstack[UDM_MAXSTACK];
	int		nastack;
	unsigned long	astack[UDM_MAXSTACK];
} UDM_BOOLSTACK;

/* One item of a parsed boolean expression: a command code and its argument. */
typedef struct {
	int		cmd;
	unsigned long	arg;
} UDM_STACK_ITEM;

/*
 * Search result: counters describing the result set, the array of
 * documents, the list of query words, the coordinate list and the parsed
 * boolean expression (at most UDM_MAXSTACK items).
 * NOTE(review): the units of work_time and the exact meaning of
 * first/last/offset are not visible in this header -- confirm.
 */
typedef struct {
	size_t			work_time;
	size_t			first;
	size_t			last;
	size_t			total_found;
	size_t			num_rows;
	size_t			cur_row;
	size_t			offset;
	size_t			memused;
	int			freeme;
	UDM_DOCUMENT		*Doc;
	
	UDM_WIDEWORDLIST	WWList;
	UDM_URLCRDLIST		CoordList;
	
	/* Bool stuff */
	size_t			nitems;
	UDM_STACK_ITEM		items[UDM_MAXSTACK];
	
} UDM_RESULT;




/* Forward declaration of UDM_AGENT (needed by the callbacks in UDM_ENV) */
struct udm_indexer_struct;

/*
 * Config file.
 * Holds the whole configuration: error state, charsets, all configured
 * lists (embedded by value), logging state, the database handle and three
 * optional callback pointers.
 */
typedef struct udm_config_struct {
	int		freeme;
	char		vardir[1024];
	int		errcode;
	char		errstr[2048];
	UDM_CHARSET	*bcs;
	UDM_CHARSET	*lcs;
	
	int		url_number;	/* For indexer -nXXX          */
        int             bad_since_time; /* try to delete bad hrefs if they are older than this */
	
	UDM_SERVER	*csrv;
	UDM_SERVERLIST	Servers;	/* List of servers and realms */
	UDM_ALIASLIST	Aliases;	/* Straight aliases           */
	UDM_ALIASLIST	ReverseAliases;	/* Reverse aliases            */
	UDM_RESULT	Targets;	/* Targets cache              */
	UDM_RESULT	Indexed;	/* Indexed cache              */
	UDM_LANGMAPLIST	LangMaps;	/* For lang+charset guesser   */
	UDM_HREFLIST	Hrefs;		/* Links cache                */
	UDM_VARLIST	Sections;	/* document section to parse  */
	UDM_ROBOTS	Robots;		/* robots.txt information     */
	UDM_SYNONYMLIST	Synonyms;	/* Synonyms list              */
	UDM_STOPLIST	StopWords;	/* Stopwords list             */
	UDM_PARSERLIST	Parsers;	/* External  parsers          */
	UDM_MIMELIST	MimeTypes;	/* For AddType commands       */
	UDM_DBLIST	sdcl;		/* Searchd addresses	      */
	UDM_HOSTLIST	Hosts;		/* Resolve cache              */
	UDM_FILTERLIST	Filters;	/* Allow, Disallow,etc        */
	UDM_SPELLLIST	Spells;		/* For ispell dictionaries    */
	UDM_AFFIXLIST	Affixes;	/* For ispell affixes         */
	UDM_VARLIST	Vars;		/* Config parameters          */
	UDM_VARLIST	ExtraHeaders;	/* Headers to send in HTTP    */
	UDM_WORDPARAM	WordParam;	/* Word limits                */
	
	/* Various file descriptors */
	int		logLevel;	/* verbose level: INFO, DEBUG etc    */
	int		is_log_open;	/* if UdmOpenLog is already called   */
	FILE		*logFD;		/* FILE structure, syslog descriptor */
	void		*db;		/* Database handler                     */
	
	/* Callbacks; the first two receive the calling agent as first argument */
	void (*ThreadInfo)(struct udm_indexer_struct *,const char * state,const char * str);
	void (*LockProc)(struct udm_indexer_struct *,int command,int type,const char *fname,int lineno);
	void (*RefInfo)(int code,const char *url, const char *ref);
	
} UDM_ENV;



/*
 * Indexer.
 * Per-agent state: bookkeeping counters, a pointer to the configuration,
 * the current language map and up to MAX_SEARCH_LIMIT search limits.
 * "time_t" needs <time.h>, which this header does not include directly.
 */
typedef struct udm_indexer_struct{
	int		freeme;		/* whenever it was allocated    */
	int		handle;		/* Handler for threaded version */
	time_t		start_time;	/* Time of allocation, for stat */
	size_t		nbytes;		/* Number of bytes downloaded   */
	/* NOTE(review): the comment on "flags" describes a callback, but the
	   field is a plain int; presumably it holds UDM_FLAG_* bits -- confirm. */
	int		flags;		/* Callback function to request action*/
	int		doccount;	/* for UdmGetDocCount()         */
	UDM_ENV		*Conf;		/* Configuration                */
	UDM_LANGMAP	*LangMap;	/* LangMap for current document */
	
	/* Cache mode limits */
	UDM_SEARCH_LIMIT	limits[MAX_SEARCH_LIMIT];
	int			nlimits;
} UDM_AGENT;



/* A URL string paired with a status value. */
typedef struct {
	char	*url;
	int	status;
} UDM_URLSTATE;

/* Comparison callback type with the signature that qsort() expects. */
typedef int (*qsort_cmp)(const void*, const void*);

/*
 * Index records.  The UINT8 variants carry a key as two 32-bit fields
 * (hi, lo); the UINT4 variants carry a single 32-bit key (val).
 * The *_URLID records map a key to a URL id; the *_POS_LEN records map a
 * key to a position and a length.
 * NOTE(review): pos/len presumably address a region of a file.  "off_t"
 * needs <sys/types.h>, which this header does not include directly.
 */
typedef struct {
	uint4	hi,lo;
	uint4	url_id;
} UDM_UINT8_URLID;

typedef struct {
	uint4	hi,lo;
	off_t	pos;
	size_t	len;
} UDM_UINT8_POS_LEN;

typedef struct {
	uint4	val;
	uint4	url_id;
} UDM_UINT4_URLID;

typedef struct {
	uint4	val;
	off_t	pos;
	size_t	len;
} UDM_UINT4_POS_LEN;


/*
 * Search daemon command codes.  The value 6 is not assigned in this list.
 */
#define UDM_SEARCHD_CMD_ERROR	   1
#define UDM_SEARCHD_CMD_MESSAGE	   2
#define UDM_SEARCHD_CMD_WORDS	   3
#define UDM_SEARCHD_CMD_GOODBYE	   4
#define UDM_SEARCHD_CMD_DOCINFO	   5
#define UDM_SEARCHD_CMD_WITHOFFSET 7
#define UDM_SEARCHD_CMD_WWL        8

/*
 * Packet header: a command code and a length.
 * NOTE(review): both fields are size_t, whose width differs between 32-bit
 * and 64-bit builds; if this structure is sent over the wire as-is, both
 * ends must agree on that width -- confirm in the searchd protocol code.
 */
typedef struct {
	size_t	cmd;
	size_t	len;
} UDM_SEARCHD_PACKET_HEADER;


/* Highest index of UDM_HTMLTOK.toks[] (the array has UDM_MAXTAGVAL+1 slots). */
#define UDM_MAXTAGVAL	64

/*
 * HTML tokenizer state.
 * The int fields from "script" to "comment" are per-context state values,
 * "lasthref" remembers an href string, and toks[] holds up to
 * UDM_MAXTAGVAL+1 name/value pairs, each stored as a pointer plus a length
 * (nlen for name, vlen for val).
 * NOTE(review): presumably "b"/"e"/"s" are cursors into the text being
 * scanned and next_b/next_e are callbacks advancing them; the
 * socket/chunks/Content/finished group looks like streaming-input state --
 * confirm in the tokenizer implementation.
 */
typedef struct {
	int	type;
	int	script;
	int	style;
	int	title;
	int	body;
	int	follow;
	int	index;
	int	comment;
	char	*lasthref;
        void    (*next_b)(void *t);
        void    (*next_e)(void *t);
        const char *e;
        const char *b;
        const char **lt;
        const char *s;
        int socket;
        int chunks;
        char *Content;
        int finished;
	size_t	ntoks;
	struct {
		const char *name;
		const char *val;
		size_t     nlen;
		size_t     vlen;
	} toks[UDM_MAXTAGVAL+1];
} UDM_HTMLTOK;

/*
 * Generic maximum / minimum of two values.
 * These are function-like macros: the argument that is selected is
 * evaluated twice, so do not pass expressions with side effects.
 * Each definition is skipped when the name is already defined.
 */
#if !defined(udm_max)
#define udm_max(a,b) ((a) > (b) ? (a) : (b))
#endif
#if !defined(udm_min)
#define udm_min(a,b) ((a) < (b) ? (a) : (b))
#endif

#ifdef DMALLOC
#include <dmalloc.h>
#endif

#endif
